import importlib
from typing import Dict
import numpy as np
from matplotlib import pyplot as plt
from shared.definitions import TuningResult
from shared.ml_config_core import ModelTrainingResult, TestTrainData
from shared import pipeline, stats_utils
from shared.ml_config_runner import build_production_model_for_tuning_result
from src import data_loader
from shared import graph
from shared import utils
import seaborn as sns
import pandas as pd
from src.utils import TargetType
# Apply the project's shared pandas / matplotlib settings (defined in shared.utils).
utils.pandas_config(pd)
utils.plt_config(plt)
# The matplotlib style is applied after the seaborn theme, so it takes precedence
# for any rcParams that both of them define.
sns.set_theme(style="darkgrid", palette="pastel")
plt.style.use("fivethirtyeight")
# Key of the ordinal-regressor model. The same string is hard-coded later in the
# notebook when its predictions are rounded.
# NOTE(review): this key is currently commented out of INCLUDE_MODELS below.
SELECTED_MODEL = "XGBoostOrdinalRegressor_Default"
# Keys of the tuned models that the training cells iterate over; commented-out
# entries are skipped.
INCLUDE_MODELS = [
# "XGBoostOrdinalRegressor_Default",
# "XGBoostMulticlassTunePRAUC",
# "XGBoostMulticlassTuneLogLoss",
"XGBoostF1Multiclass",
]
# When True, cells print / display additional diagnostics.
VERBOSE = True
# Reload the loader module before using it in this cell.
importlib.reload(data_loader)

# Dataset with the loan grade as target.
# NOTE(review): drop_cols presumably removes columns that would leak the grade - confirm in data_loader.
transformed_data = data_loader.load_processed_dataset(
    target_col="grade",
    target_type=TargetType.MulticlassOrdinal,
    sample_size=data_loader.SampleSize.Small,
    drop_cols=["loan_status", "sub_grade", "int_rate"],
)

# Copy of the dataset in which target codes 4, 5 and 6 are collapsed into code 4
# (presumably grades E, F and G - verify against the target encoding).
transformed_data_simplified_grade = transformed_data.copy()
grade_codes = transformed_data_simplified_grade["target__grade"]
transformed_data_simplified_grade["target__grade"] = np.where(
    grade_codes.isin([6, 5, 4]), 4, grade_codes
)

if VERBOSE:
    print(f"Total samples loaded : {len(transformed_data)}")
Dropping cols where: grade is missing 90000 -> 89996
With transform="pandas", `func` should return a DataFrame to follow the set_output API.
Total samples loaded : 89996
# Build one model per key in INCLUDE_MODELS from its serialized tuning result,
# keep the training result in memory and serialize it under the same key.
cv_results_all_models: Dict[str, ModelTrainingResult] = {}
for model_key in INCLUDE_MODELS:
    tuning_result = TuningResult.load_serialized_tuning_result(model_key)
    cv_results = build_production_model_for_tuning_result(
        tuning_result=tuning_result, df=transformed_data
    )
    cv_results_all_models[model_key] = cv_results
    ModelTrainingResult.serialize_model(cv_results, model_key)

if VERBOSE:
    # Show the fitted pipeline of every model that was actually built, instead of
    # indexing one hard-coded key (which raised KeyError whenever that key was
    # removed from INCLUDE_MODELS).
    for model_key, cv_results in cv_results_all_models.items():
        display(cv_results.test_data.test_model)
Pipeline(steps=[('preprocessing',
FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)),
('feat_trans_loan_grade', LoanGradeTransformer()),
('feat_trans_dti_inc_joint', JointApplicationTransformer()),
('feat_trans_fico_score', FICOScoreTransformer()),
('feat_trans_delinquency', DelinquencyTransformer()),
('feat_trans_inst_income_ratio', Ins...
feature_types=None, gamma=0.1, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.3,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=7, max_leaves=None,
min_child_weight=2.5, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=150, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob', ...))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessing',
FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)),
('feat_trans_loan_grade', LoanGradeTransformer()),
('feat_trans_dti_inc_joint', JointApplicationTransformer()),
('feat_trans_fico_score', FICOScoreTransformer()),
('feat_trans_delinquency', DelinquencyTransformer()),
('feat_trans_inst_income_ratio', Ins...
feature_types=None, gamma=0.1, grow_policy=None,
importance_type=None,
interaction_constraints=None, learning_rate=0.3,
max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None,
max_depth=7, max_leaves=None,
min_child_weight=2.5, missing=nan,
monotone_constraints=None, multi_strategy=None,
n_estimators=150, n_jobs=None,
num_parallel_tree=None,
objective='multi:softprob', ...))])FunctionTransformer(func=<function convert_to_category at 0x7f1aa69abd00>)
LoanGradeTransformer()
JointApplicationTransformer()
FICOScoreTransformer()
DelinquencyTransformer()
InstallementIncomeRatio()
NewDtiTransformer()
DummyDropAllButFICOHigh(option=<Options.OFF: 0>)
FunctionTransformer(func=<function get_pipeline.<locals>.remove_columns_with_prefix at 0x7f1aa4fc6e60>,
kw_args={'prefix': 'target__'})XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device='cpu', early_stopping_rounds=None,
enable_categorical=True, eval_metric=None, feature_types=None,
gamma=0.1, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=0.3, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=7, max_leaves=None,
min_child_weight=2.5, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=150, n_jobs=None,
def plot_regressor_actual_vs_predicted(
    data: TestTrainData,
    model_name: str,
    max_points: int = 10000,
    random_state: int = 42,
):
    """
    Render a scatter plot of actual vs. predicted values with a reference diagonal.

    Parameters:
    - data: TestTrainData instance providing `y_test` and `predictions`.
      If `predictions` is a DataFrame, only its first column is plotted.
    - model_name: label used as the prefix of the plot title.
    - max_points: largest number of points to draw; bigger datasets are randomly
      down-sampled to this size. Pass None to always plot every point.
    - random_state: seed for the down-sampling, so repeated runs give the same plot.
    """
    if isinstance(data.predictions, pd.DataFrame):
        predictions_series = data.predictions.iloc[:, 0]
    else:
        predictions_series = data.predictions

    plot_data = pd.DataFrame({"Actual": data.y_test, "Predicted": predictions_series})

    # Down-sample very large datasets to keep rendering fast and the plot readable.
    if max_points is not None and len(plot_data) > max_points:
        plot_data = plot_data.sample(max_points, random_state=random_state)

    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        data=plot_data, x="Actual", y="Predicted", alpha=0.3, edgecolor=None
    )

    # Diagonal spanning the combined value range: points on it are perfect predictions.
    ax_min = min(plot_data["Actual"].min(), plot_data["Predicted"].min())
    ax_max = max(plot_data["Actual"].max(), plot_data["Predicted"].max())
    plt.plot([ax_min, ax_max], [ax_min, ax_max], color="red", lw=2, linestyle="--")

    plt.title(f"{model_name} Actual vs. Predicted Values")
    plt.xlabel("Actual Value")
    plt.ylabel("Predicted Value")
    plt.tight_layout()
    plt.show()
# Scatter plot of actual vs. predicted values for every model whose key marks it as a regressor.
for model_name, training_result in cv_results_all_models.items():
    if "Regressor" not in model_name:
        continue
    plot_regressor_actual_vs_predicted(training_result.test_data, model_name)
# Hexbin joint plot of actual vs. predicted values for every regressor model.
for k, model_data in cv_results_all_models.items():
    if "Regressor" in k:
        test_data = model_data.test_data
        # sns.jointplot always creates its own (square) figure, so a preceding
        # plt.figure(figsize=...) only left an empty figure behind. The size is
        # controlled through `height` instead.
        joint_plot = sns.jointplot(
            x=test_data.y_test,
            y=test_data.predictions,
            kind="hex",
            color="#4CB391",
            height=9,
        )
        joint_plot.figure.suptitle(f"{k}", fontsize=16, y=1.03)
        joint_plot.set_axis_labels("Actual Grade", "Predicted", fontsize=14)
        plt.show()
<Figure size 1100x900 with 0 Axes>
# One violin plot per model: distribution of predictions for each actual target value.
for model_name, training_result in cv_results_all_models.items():
    held_out = training_result.test_data
    plt.figure(figsize=(10, 8))
    violin_ax = sns.violinplot(
        x=held_out.y_test,
        y=held_out.predictions,
        inner="quart",
        fill=False,
    )
    violin_ax.set_title(f"{model_name}")
# Round the selected regressor's continuous predictions to the nearest value.
# The regressor is only available when its key is listed in INCLUDE_MODELS, so the
# lookup is guarded rather than raising KeyError. `predictions_rounded` stays None
# when the model was not built.
predictions_rounded = None
if SELECTED_MODEL in cv_results_all_models:
    predictions_rounded = cv_results_all_models[
        SELECTED_MODEL
    ].test_data.predictions.round()
%matplotlib inline
importlib.reload(graph)
def render_multiclass_confusion_matrices(all_models, columns=2):
    """
    Draw one confusion matrix per model on a shared grid of subplots.

    Parameters:
    - all_models: mapping of model key -> training result. Each result's
      `test_data` is passed to `graph.confusion_matrix_plot_v2`.
    - columns: number of subplot columns (default 2, the original layout).

    Raises:
    - ValueError: if `columns` is smaller than 1.

    Nothing is drawn when `all_models` is empty.
    """
    if columns < 1:
        raise ValueError("columns must be a positive integer")

    n = len(all_models)
    if n == 0:
        # plt.subplots cannot create a grid with zero rows.
        return

    # Ceiling division, so a partially filled last row still gets created.
    rows = (n + columns - 1) // columns
    height = 18
    width = height * columns

    # squeeze=False always returns a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(
        rows,
        columns,
        figsize=(width, height * rows),
        constrained_layout=True,
        squeeze=False,
    )
    fig.suptitle("Confusion Matrices: Best Models based on f1", fontsize=20)
    axes_flat = axes.flatten()

    for ax, (model_key, model_result) in zip(axes_flat, all_models.items()):
        # NOTE(review): regressor_input=True is passed for every model, including
        # classifiers - confirm this matches graph.confusion_matrix_plot_v2.
        graph.confusion_matrix_plot_v2(
            model_result.test_data,
            title=model_key,
            ax=ax,
            regressor_input=True,
        )

    # Hide the grid cells that have no model assigned to them.
    for unused_ax in axes_flat[n:]:
        unused_ax.axis("off")

    plt.show()
importlib.reload(stats_utils)
def render_importance_charts(data, all_models):
    """
    Render a feature-importance chart for every model in `all_models`.

    Parameters:
    - data: dataset forwarded to `stats_utils.get_model_feature_importances`.
    - all_models: mapping of model key -> model training result.

    When VERBOSE is set, the importances are also displayed after each chart.
    """
    for model_key, model_config in all_models.items():
        importances = stats_utils.get_model_feature_importances(model_config, data)
        graph.render_feature_importances_chart(
            feature_importances=importances,
            title=f"{model_key} Importances",
        )
        if VERBOSE:
            display(importances)
# Confusion matrices for the models trained on the full grade target.
render_multiclass_confusion_matrices(cv_results_all_models)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1471: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. _warn_prf(average, modifier, msg_start, len(result))
# render_importance_charts takes (data, all_models). Pass the dataset these models
# were built from; calling it with the models alone raises TypeError.
render_importance_charts(transformed_data, cv_results_all_models)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.200921 |
| 1 | emp_length_parser__emp_length | 0.000000 |
| 2 | zip__zip_code | 0.014028 |
| 3 | pass__loan_amnt | 0.019779 |
| 4 | pass__installment | 0.028261 |
| 5 | pass__home_ownership | 0.000000 |
| 6 | pass__annual_inc | 0.010391 |
| 7 | pass__verification_status | 0.052370 |
| 8 | pass__purpose | 0.033422 |
| 9 | pass__addr_state | 0.000000 |
| 10 | pass__dti | 0.016340 |
| 11 | pass__delinq_2yrs | 0.000000 |
| 12 | pass__fico_range_low | 0.203827 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.041348 |
| 15 | pass__mths_since_last_delinq | 0.000000 |
| 16 | pass__mths_since_last_record | 0.000000 |
| 17 | pass__open_acc | 0.000000 |
| 18 | pass__pub_rec | 0.000000 |
| 19 | pass__revol_bal | 0.020648 |
| 20 | pass__revol_util | 0.038275 |
| 21 | pass__total_acc | 0.028199 |
| 22 | pass__initial_list_status | 0.046777 |
| 23 | pass__last_fico_range_high | 0.089089 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.000000 |
| 26 | pass__mths_since_last_major_derog | 0.000000 |
| 27 | pass__application_type | 0.000000 |
| 28 | pass__verification_status_joint | 0.000000 |
| 29 | pass__inq_fi | 0.015884 |
| 30 | pass__inq_last_12m | 0.017836 |
| 31 | pass__chargeoff_within_12_mths | 0.000000 |
| 32 | pass__mort_acc | 0.016369 |
| 33 | pass__pub_rec_bankruptcies | 0.000000 |
| 34 | pass__tax_liens | 0.000000 |
| 35 | pass__tot_hi_cred_lim | 0.019794 |
| 36 | pass__total_bal_ex_mort | 0.000970 |
| 37 | installment_income_ratio | 0.053909 |
| 38 | new_dti | 0.031560 |
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.271462 |
| 1 | emp_length_parser__emp_length | 0.008289 |
| 2 | zip__zip_code | 0.106345 |
| 3 | pass__loan_amnt | 0.061711 |
| 4 | pass__installment | 0.046212 |
| 5 | pass__home_ownership | 0.008468 |
| 6 | pass__annual_inc | 0.012985 |
| 7 | pass__verification_status | 0.016868 |
| 8 | pass__purpose | 0.023965 |
| 9 | pass__addr_state | 0.015981 |
| 10 | pass__dti | 0.015480 |
| 11 | pass__delinq_2yrs | 0.011471 |
| 12 | pass__fico_range_low | 0.085221 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.023751 |
| 15 | pass__mths_since_last_delinq | 0.009172 |
| 16 | pass__mths_since_last_record | 0.010692 |
| 17 | pass__open_acc | 0.008931 |
| 18 | pass__pub_rec | 0.009866 |
| 19 | pass__revol_bal | 0.010371 |
| 20 | pass__revol_util | 0.010980 |
| 21 | pass__total_acc | 0.010002 |
| 22 | pass__initial_list_status | 0.048485 |
| 23 | pass__last_fico_range_high | 0.020814 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.013943 |
| 26 | pass__mths_since_last_major_derog | 0.009580 |
| 27 | pass__application_type | 0.013948 |
| 28 | pass__verification_status_joint | 0.014838 |
| 29 | pass__inq_fi | 0.014440 |
| 30 | pass__inq_last_12m | 0.014292 |
| 31 | pass__chargeoff_within_12_mths | 0.013176 |
| 32 | pass__mort_acc | 0.010901 |
| 33 | pass__pub_rec_bankruptcies | 0.012358 |
| 34 | pass__tax_liens | 0.013203 |
| 35 | pass__tot_hi_cred_lim | 0.012728 |
| 36 | pass__total_bal_ex_mort | 0.009067 |
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.186552 |
| 1 | emp_length_parser__emp_length | 0.008174 |
| 2 | zip__zip_code | 0.066410 |
| 3 | pass__loan_amnt | 0.069225 |
| 4 | pass__installment | 0.054875 |
| 5 | pass__home_ownership | 0.008708 |
| 6 | pass__annual_inc | 0.017367 |
| 7 | pass__verification_status | 0.024303 |
| 8 | pass__purpose | 0.027819 |
| 9 | pass__addr_state | 0.014011 |
| 10 | pass__dti | 0.019013 |
| 11 | pass__delinq_2yrs | 0.010187 |
| 12 | pass__fico_range_low | 0.124549 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.032111 |
| 15 | pass__mths_since_last_delinq | 0.009185 |
| 16 | pass__mths_since_last_record | 0.010386 |
| 17 | pass__open_acc | 0.008975 |
| 18 | pass__pub_rec | 0.007683 |
| 19 | pass__revol_bal | 0.011625 |
| 20 | pass__revol_util | 0.013283 |
| 21 | pass__total_acc | 0.011336 |
| 22 | pass__initial_list_status | 0.055525 |
| 23 | pass__last_fico_range_high | 0.029334 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.011566 |
| 26 | pass__mths_since_last_major_derog | 0.010041 |
| 27 | pass__application_type | 0.018512 |
| 28 | pass__annual_inc_joint | 0.011335 |
| 29 | pass__dti_joint | 0.017017 |
| 30 | pass__verification_status_joint | 0.013634 |
| 31 | pass__inq_fi | 0.017480 |
| 32 | pass__inq_last_12m | 0.016551 |
| 33 | pass__chargeoff_within_12_mths | 0.006341 |
| 34 | pass__mort_acc | 0.012275 |
| 35 | pass__pub_rec_bankruptcies | 0.012554 |
| 36 | pass__tax_liens | 0.008045 |
| 37 | pass__tot_hi_cred_lim | 0.014740 |
| 38 | pass__total_bal_ex_mort | 0.009274 |
While the performance for grades A, B and C is relatively acceptable (F1 > ~0.8), performance when predicting the lower grades is very poor (especially for grade G, which is almost never classified correctly).
As we learned when building our default risk model, returns and other features do not vary as much between the lower-quality grades, which may make those grades hard to distinguish. Therefore, we'll use the same approach and merge the E, F and G grades into a single group.
Merging E-F-G loan grades into a single group¶
# Rebuild every model in INCLUDE_MODELS on the dataset with the merged grade target.
# NOTE(review): the result is serialized under the same model_key as the model built
# on the full grade target, which presumably replaces that earlier artifact - confirm
# against ModelTrainingResult.serialize_model.
cv_results_all_models_simplified_grade: Dict[str, ModelTrainingResult] = {}
for model_key in INCLUDE_MODELS:
    cv_results = build_production_model_for_tuning_result(
        tuning_result=TuningResult.load_serialized_tuning_result(model_key),
        df=transformed_data_simplified_grade,
    )
    cv_results_all_models_simplified_grade[model_key] = cv_results
    ModelTrainingResult.serialize_model(cv_results, model_key)
Training: XGBoostF1Multiclass with: {'feat_trans_delinquency__option': 0, 'feat_trans_dti_inc_joint__option': 0, 'feat_trans_dummy_DROP_ALL_BUT_FICO_HIGH__option': 0, 'feat_trans_fico_score__option': 0, 'feat_trans_inst_income_ratio__option': 0, 'feat_trans_loan_grade__option': 0, 'feat_trans_new_dti_after_loan__option': 0, 'model__n_estimators': 150, 'model__min_child_weight': 2.5, 'model__max_depth': 7, 'model__learning_rate': 0.3, 'model__gamma': 0.1}
XGBoostF1Multiclass: 60.8 seconds
# Confusion matrices for the models trained on the merged grade target.
render_multiclass_confusion_matrices(cv_results_all_models_simplified_grade)
# render_importance_charts takes (data, all_models). Pass the merged-grade dataset
# these models were built from; calling it with the models alone raises TypeError.
render_importance_charts(
    transformed_data_simplified_grade, cv_results_all_models_simplified_grade
)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.200921 |
| 1 | emp_length_parser__emp_length | 0.000000 |
| 2 | zip__zip_code | 0.014028 |
| 3 | pass__loan_amnt | 0.019779 |
| 4 | pass__installment | 0.028261 |
| 5 | pass__home_ownership | 0.000000 |
| 6 | pass__annual_inc | 0.010391 |
| 7 | pass__verification_status | 0.052370 |
| 8 | pass__purpose | 0.033422 |
| 9 | pass__addr_state | 0.000000 |
| 10 | pass__dti | 0.016340 |
| 11 | pass__delinq_2yrs | 0.000000 |
| 12 | pass__fico_range_low | 0.203827 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.041348 |
| 15 | pass__mths_since_last_delinq | 0.000000 |
| 16 | pass__mths_since_last_record | 0.000000 |
| 17 | pass__open_acc | 0.000000 |
| 18 | pass__pub_rec | 0.000000 |
| 19 | pass__revol_bal | 0.020648 |
| 20 | pass__revol_util | 0.038275 |
| 21 | pass__total_acc | 0.028199 |
| 22 | pass__initial_list_status | 0.046777 |
| 23 | pass__last_fico_range_high | 0.089089 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.000000 |
| 26 | pass__mths_since_last_major_derog | 0.000000 |
| 27 | pass__application_type | 0.000000 |
| 28 | pass__verification_status_joint | 0.000000 |
| 29 | pass__inq_fi | 0.015884 |
| 30 | pass__inq_last_12m | 0.017836 |
| 31 | pass__chargeoff_within_12_mths | 0.000000 |
| 32 | pass__mort_acc | 0.016369 |
| 33 | pass__pub_rec_bankruptcies | 0.000000 |
| 34 | pass__tax_liens | 0.000000 |
| 35 | pass__tot_hi_cred_lim | 0.019794 |
| 36 | pass__total_bal_ex_mort | 0.000970 |
| 37 | installment_income_ratio | 0.053909 |
| 38 | new_dti | 0.031560 |
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.271462 |
| 1 | emp_length_parser__emp_length | 0.008289 |
| 2 | zip__zip_code | 0.106345 |
| 3 | pass__loan_amnt | 0.061711 |
| 4 | pass__installment | 0.046212 |
| 5 | pass__home_ownership | 0.008468 |
| 6 | pass__annual_inc | 0.012985 |
| 7 | pass__verification_status | 0.016868 |
| 8 | pass__purpose | 0.023965 |
| 9 | pass__addr_state | 0.015981 |
| 10 | pass__dti | 0.015480 |
| 11 | pass__delinq_2yrs | 0.011471 |
| 12 | pass__fico_range_low | 0.085221 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.023751 |
| 15 | pass__mths_since_last_delinq | 0.009172 |
| 16 | pass__mths_since_last_record | 0.010692 |
| 17 | pass__open_acc | 0.008931 |
| 18 | pass__pub_rec | 0.009866 |
| 19 | pass__revol_bal | 0.010371 |
| 20 | pass__revol_util | 0.010980 |
| 21 | pass__total_acc | 0.010002 |
| 22 | pass__initial_list_status | 0.048485 |
| 23 | pass__last_fico_range_high | 0.020814 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.013943 |
| 26 | pass__mths_since_last_major_derog | 0.009580 |
| 27 | pass__application_type | 0.013948 |
| 28 | pass__verification_status_joint | 0.014838 |
| 29 | pass__inq_fi | 0.014440 |
| 30 | pass__inq_last_12m | 0.014292 |
| 31 | pass__chargeoff_within_12_mths | 0.013176 |
| 32 | pass__mort_acc | 0.010901 |
| 33 | pass__pub_rec_bankruptcies | 0.012358 |
| 34 | pass__tax_liens | 0.013203 |
| 35 | pass__tot_hi_cred_lim | 0.012728 |
| 36 | pass__total_bal_ex_mort | 0.009067 |
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.186552 |
| 1 | emp_length_parser__emp_length | 0.008174 |
| 2 | zip__zip_code | 0.066410 |
| 3 | pass__loan_amnt | 0.069225 |
| 4 | pass__installment | 0.054875 |
| 5 | pass__home_ownership | 0.008708 |
| 6 | pass__annual_inc | 0.017367 |
| 7 | pass__verification_status | 0.024303 |
| 8 | pass__purpose | 0.027819 |
| 9 | pass__addr_state | 0.014011 |
| 10 | pass__dti | 0.019013 |
| 11 | pass__delinq_2yrs | 0.010187 |
| 12 | pass__fico_range_low | 0.124549 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.032111 |
| 15 | pass__mths_since_last_delinq | 0.009185 |
| 16 | pass__mths_since_last_record | 0.010386 |
| 17 | pass__open_acc | 0.008975 |
| 18 | pass__pub_rec | 0.007683 |
| 19 | pass__revol_bal | 0.011625 |
| 20 | pass__revol_util | 0.013283 |
| 21 | pass__total_acc | 0.011336 |
| 22 | pass__initial_list_status | 0.055525 |
| 23 | pass__last_fico_range_high | 0.029334 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.011566 |
| 26 | pass__mths_since_last_major_derog | 0.010041 |
| 27 | pass__application_type | 0.018512 |
| 28 | pass__annual_inc_joint | 0.011335 |
| 29 | pass__dti_joint | 0.017017 |
| 30 | pass__verification_status_joint | 0.013634 |
| 31 | pass__inq_fi | 0.017480 |
| 32 | pass__inq_last_12m | 0.016551 |
| 33 | pass__chargeoff_within_12_mths | 0.006341 |
| 34 | pass__mort_acc | 0.012275 |
| 35 | pass__pub_rec_bankruptcies | 0.012554 |
| 36 | pass__tax_liens | 0.008045 |
| 37 | pass__tot_hi_cred_lim | 0.014740 |
| 38 | pass__total_bal_ex_mort | 0.009274 |
Classifying Subgrades¶
# Reload the loader module before using it in this cell.
importlib.reload(data_loader)

# Dataset with the loan sub-grade as target; the top-level grade is dropped here.
transformed_data_subgrade = data_loader.load_processed_dataset(
    target_col="sub_grade",
    target_type=TargetType.MulticlassOrdinalExtended,
    sample_size=data_loader.SampleSize.Small,
    drop_cols=["loan_status", "grade", "int_rate"],
)

if VERBOSE:
    print(f"Total samples loaded : {len(transformed_data_subgrade)}")
Dropping cols where: sub_grade is missing 90000 -> 89996
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/preprocessing/_function_transformer.py:345: UserWarning: With transform="pandas", `func` should return a DataFrame to follow the set_output API. warnings.warn(
Total samples loaded : 89996
# Rebuild every model in INCLUDE_MODELS with the sub-grade as target.
# NOTE(review): serialized under the same model_key as the grade models - confirm
# whether ModelTrainingResult.serialize_model replaces the earlier artifact.
cv_results_all_models_sub_grade: Dict[str, ModelTrainingResult] = {}
for model_key in INCLUDE_MODELS:
    loaded_tuning = TuningResult.load_serialized_tuning_result(model_key)
    sub_grade_result = build_production_model_for_tuning_result(
        df=transformed_data_subgrade,
        tuning_result=loaded_tuning,
    )
    cv_results_all_models_sub_grade[model_key] = sub_grade_result
    ModelTrainingResult.serialize_model(sub_grade_result, model_key)
/home/paulius/miniconda3/envs/rapids_v2/lib/python3.10/site-packages/sklearn/metrics/_classification.py:2922: UserWarning: The y_pred values do not sum to one. Starting from 1.5 thiswill result in an error. warnings.warn(
Training: XGBoostF1Multiclass with: {'feat_trans_delinquency__option': 0, 'feat_trans_dti_inc_joint__option': 0, 'feat_trans_dummy_DROP_ALL_BUT_FICO_HIGH__option': 0, 'feat_trans_fico_score__option': 0, 'feat_trans_inst_income_ratio__option': 0, 'feat_trans_loan_grade__option': 0, 'feat_trans_new_dti_after_loan__option': 0, 'model__n_estimators': 150, 'model__min_child_weight': 2.5, 'model__max_depth': 7, 'model__learning_rate': 0.3, 'model__gamma': 0.1}
XGBoostF1Multiclass: 293.7 seconds
Predicting Sub-grades¶
We've attempted to build a model which predicts individual sub-grades (e.g. A1, A2 ... G5) in addition to the top-level grades. An XGBoost multi-class classification model was used; however, its performance was unsatisfactory. We've included the overall performance summary and feature importances below. However, we decided not to provide an in-depth analysis because the model would not be useful for any practical application (a different approach would probably suit this problem better, given the high number of classes and their ordinal nature).
# Print each sub-grade model's key followed by its test-set metrics, one per line.
for model_key, model_results in cv_results_all_models_sub_grade.items():
    print(model_key, model_results.test_data.metrics, sep="\n")
XGBoostF1Multiclass
{'f1': 0.199, 'accuracy': 0.2733, 'precision': 0.2108, 'recall': 0.1947, 'log_loss': 2}
None
# Feature-importance charts for the sub-grade models, computed against the sub-grade dataset.
render_importance_charts(transformed_data_subgrade, cv_results_all_models_sub_grade)
/home/paulius/data/projects/m3_s3_lending/shared/graph.py:457: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator. ax.set_yticklabels(
| Feature | Importance | |
|---|---|---|
| 0 | term_parser__term | 0.106485 |
| 1 | emp_length_parser__emp_length | 0.016260 |
| 2 | zip__zip_code | 0.063670 |
| 3 | pass__loan_amnt | 0.048561 |
| 4 | pass__installment | 0.046285 |
| 5 | pass__home_ownership | 0.017024 |
| 6 | pass__annual_inc | 0.020558 |
| 7 | pass__verification_status | 0.024662 |
| 8 | pass__purpose | 0.026665 |
| 9 | pass__addr_state | 0.025046 |
| 10 | pass__dti | 0.021775 |
| 11 | pass__delinq_2yrs | 0.017929 |
| 12 | pass__fico_range_low | 0.065871 |
| 13 | pass__fico_range_high | 0.000000 |
| 14 | pass__inq_last_6mths | 0.027946 |
| 15 | pass__mths_since_last_delinq | 0.017716 |
| 16 | pass__mths_since_last_record | 0.019174 |
| 17 | pass__open_acc | 0.016289 |
| 18 | pass__pub_rec | 0.020212 |
| 19 | pass__revol_bal | 0.018316 |
| 20 | pass__revol_util | 0.019093 |
| 21 | pass__total_acc | 0.017927 |
| 22 | pass__initial_list_status | 0.033912 |
| 23 | pass__last_fico_range_high | 0.024567 |
| 24 | pass__last_fico_range_low | 0.000000 |
| 25 | pass__collections_12_mths_ex_med | 0.016248 |
| 26 | pass__mths_since_last_major_derog | 0.017968 |
| 27 | pass__application_type | 0.026973 |
| 28 | pass__annual_inc_joint | 0.021111 |
| 29 | pass__dti_joint | 0.022676 |
| 30 | pass__verification_status_joint | 0.022423 |
| 31 | pass__inq_fi | 0.023245 |
| 32 | pass__inq_last_12m | 0.022318 |
| 33 | pass__chargeoff_within_12_mths | 0.018417 |
| 34 | pass__mort_acc | 0.017800 |
| 35 | pass__pub_rec_bankruptcies | 0.020849 |
| 36 | pass__tax_liens | 0.016260 |
| 37 | pass__tot_hi_cred_lim | 0.020096 |
| 38 | pass__total_bal_ex_mort | 0.017674 |